package util

import (
	"fmt"
	"net/http"
	"net/url"
	"strings"
	"time"
	"unicode/utf8"

	"github.com/PuerkitoBio/goquery"
	"golang.org/x/text/encoding/simplifiedchinese"
)

// Encoding labels returned by GetStrCoding.
const (
	// GBK is reported when the bytes are not UTF-8 but pass the isGBK check.
	GBK     string = "GBK"
	// UTF8 is reported when the bytes pass the isUtf8 check.
	UTF8    string = "UTF8"
	// UNKNOWN is reported when the bytes pass neither check.
	UNKNOWN string = "UNKNOWN"
)

// Attribute names looked up on matched elements, and the placeholder strings
// used when a page yields no title or description.
const (
	// href is the attribute read from <link> elements by getIcon.
	href         = "href"
	// content is the attribute read from <meta> elements by getTitle and getDesc.
	content      = "content"
	// defaultTitle is the fallback title (the literal means "untitled").
	defaultTitle = "无标题"
	// defaultDesc is the fallback description (the literal means "no description").
	defaultDesc  = "无描述"
)

// UrlParse holds the link-preview fields that UrlCrawler extracts from a page.
type UrlParse struct {
	Link  string `json:"link"`  // address of the link
	Title string `json:"title"` // title of the linked page
	Desc  string `json:"desc"`  // description of the linked page
	Image string `json:"image"` // cover image; UrlCrawler stores the page's icon URL here
}

// UrlCrawler downloads the page at link and extracts the fields needed for a
// link preview: title, description and icon URL.
//
// A leading "https://" is downgraded to "http://" before the request is made,
// and the rewritten address is what gets stored in UrlParse.Link.
//
// A non-nil error is returned when the request fails or exceeds the timeout,
// when the server answers with anything other than 200 OK, when the body
// cannot be loaded as an HTML document, or when one of the field extractors
// fails. Fields filled in before the failure stay set in the returned value,
// but callers should not rely on them.
func UrlCrawler(link string) (urlParse UrlParse, err error) {
	// Upper bound for the whole exchange (connect, redirects, body read). The
	// default client has no timeout, so a stalled server would block forever.
	const requestTimeout = 10 * time.Second

	// Rewrite only the scheme. An "https://" that appears later in the URL
	// (for example inside a redirect query parameter) is payload and must be
	// left alone.
	if strings.HasPrefix(link, "https://") {
		link = "http://" + strings.TrimPrefix(link, "https://")
	}
	urlParse.Link = link

	// A nil Transport makes the client share http.DefaultTransport, so
	// connections are still pooled across calls.
	client := &http.Client{Timeout: requestTimeout}
	res, err := client.Get(link)
	if err != nil {
		return urlParse, fmt.Errorf("fetching %q: %w", link, err)
	}
	defer res.Body.Close()
	if res.StatusCode != http.StatusOK {
		// Report the failure instead of returning an empty preview with a nil
		// error, which callers cannot tell apart from success.
		return urlParse, fmt.Errorf("fetching %q: unexpected status %s", link, res.Status)
	}

	doc, err := goquery.NewDocumentFromReader(res.Body)
	if err != nil {
		return urlParse, fmt.Errorf("parsing %q: %w", link, err)
	}

	if urlParse.Title, err = getTitle(doc); err != nil {
		return urlParse, fmt.Errorf("reading title of %q: %w", link, err)
	}
	if urlParse.Desc, err = getDesc(doc); err != nil {
		return urlParse, fmt.Errorf("reading description of %q: %w", link, err)
	}
	if urlParse.Image, err = getIcon(doc, link); err != nil {
		return urlParse, fmt.Errorf("reading icon of %q: %w", link, err)
	}
	return urlParse, nil
}

// getTitle returns the title of the page held in doc.
//
// The first <title> element is preferred. When it is missing or blank, the
// Open Graph and Twitter title meta tags are tried in order. The chosen value
// is converted to UTF-8 by transformUtf8 and stripped of surrounding
// whitespace. defaultTitle is returned when no source yields visible text.
// err is non-nil only when the UTF-8 conversion fails.
func getTitle(doc *goquery.Document) (title string, err error) {
	// A document can contain more than one <title> (inline SVG carries its
	// own); Text on the whole selection would concatenate all of them.
	title = doc.Find("title").First().Text()

	fallbacks := []string{
		"meta[property='og:title']",
		"meta[name='og:title']",
		"meta[property='twitter:title']",
		"meta[name='twitter:title']",
	}
	for _, selector := range fallbacks {
		// A whitespace-only candidate counts as missing, so the next source
		// still gets its turn.
		if strings.TrimSpace(title) != "" {
			break
		}
		title = getElementVal(doc, selector, content)
	}
	if strings.TrimSpace(title) == "" {
		return defaultTitle, nil
	}

	title, err = transformUtf8(title)
	if err != nil {
		return title, err
	}
	title = strings.TrimSpace(title)
	if title == "" {
		// The text decoded to nothing but whitespace.
		title = defaultTitle
	}
	return title, nil
}

// getDesc returns a short description of the page held in doc.
//
// The description, Open Graph and Twitter meta tags are tried in order; when
// none carries visible text, the concatenated text of the page's <div>
// elements is used instead. The chosen value is converted to UTF-8 by
// transformUtf8, stripped of line breaks and surrounding whitespace, and
// truncated to 150 runes. defaultDesc is returned when no source yields
// visible text. err is non-nil only when the UTF-8 conversion fails.
func getDesc(doc *goquery.Document) (desc string, err error) {
	const maxDescRunes = 150

	metaSelectors := []string{
		"meta[name='description']",
		"meta[property='og:description']",
		"meta[name='og:description']",
		"meta[property='twitter:description']",
		"meta[name='twitter:description']",
	}
	for _, selector := range metaSelectors {
		desc = getElementVal(doc, selector, content)
		// A whitespace-only value counts as missing, so later sources are
		// still consulted.
		if strings.TrimSpace(desc) != "" {
			break
		}
	}
	if strings.TrimSpace(desc) == "" {
		// Last resort: whatever text the page body offers.
		desc = doc.Find("div").Text()
	}
	if strings.TrimSpace(desc) == "" {
		return defaultDesc, nil
	}

	desc, err = transformUtf8(desc)
	if err != nil {
		return desc, err
	}
	// Drop both halves of CRLF line endings, not just the "\n".
	desc = strings.NewReplacer("\r", "", "\n", "").Replace(desc)
	desc = strings.TrimSpace(desc)
	if desc == "" {
		// The text decoded to nothing but whitespace.
		return defaultDesc, nil
	}
	// Truncate by rune so a multi-byte character is never cut in half.
	if runes := []rune(desc); len(runes) > maxDescRunes {
		desc = string(runes[:maxDescRunes])
	}
	return desc, nil
}

// getIcon returns the absolute URL of the icon declared by the page in doc,
// or "" when the page declares none.
//
// link is the address the page was fetched from and serves as the base for
// resolving relative hrefs. An href that already starts with "http://" or
// "https://" is returned unchanged, and a protocol-relative href ("//host/x")
// gets "http:" prepended. Everything else is resolved against link the way a
// browser would, so the port is kept and both "/x.ico" and "x.ico" work.
//
// err is non-nil only when link itself cannot be parsed.
func getIcon(doc *goquery.Document, link string) (icon string, err error) {
	selectors := []string{
		"link[rel='shortcut icon']",
		"link[rel='SHORTCUT ICON']",
		"link[rel='icon']",
	}
	for _, selector := range selectors {
		icon = strings.TrimSpace(getElementVal(doc, selector, href))
		if icon != "" {
			break
		}
	}
	if icon == "" {
		return "", nil
	}

	// Already absolute: nothing to do. The full scheme is checked so that a
	// relative path such as "httpd/icon.png" is not mistaken for a URL.
	if strings.HasPrefix(icon, "http://") || strings.HasPrefix(icon, "https://") {
		return icon, nil
	}
	// Protocol-relative: pin it to plain http.
	if strings.HasPrefix(icon, "//") {
		return "http:" + icon, nil
	}

	base, err := url.Parse(link)
	if err != nil {
		return icon, err
	}
	ref, err := url.Parse(icon)
	if err != nil {
		// The icon is cosmetic; a malformed href should not fail the whole
		// preview, so it is dropped instead of reported.
		return "", nil
	}
	// Credentials embedded in the page address must not leak into the icon URL.
	base.User = nil
	return base.ResolveReference(ref).String(), nil
}

// getElementVal looks up selector in doc and returns the value of attribute
// key on the first matching element. The result is "" when nothing matches or
// when the first match does not carry that attribute.
func getElementVal(doc *goquery.Document, selector, key string) (val string) {
	// Attr reads from the first node of the selection and yields "" when the
	// selection is empty or the attribute is absent.
	val, _ = doc.Find(selector).Attr(key)
	return val
}

// transformUtf8 returns sourceStr as UTF-8 text.
//
// Input that GetStrCoding classifies as UTF8 is returned untouched. Anything
// else, including input classified as UNKNOWN, is run through the GBK
// decoder. On a decoder error the returned string is "" and err is the
// decoder's error.
func transformUtf8(sourceStr string) (utf8Str string, err error) {
	raw := []byte(sourceStr)
	if GetStrCoding(raw) == UTF8 {
		return sourceStr, nil
	}

	decoded, err := simplifiedchinese.GBK.NewDecoder().Bytes(raw)
	if err != nil {
		return "", err
	}
	return string(decoded), nil
}

// isGBK reports whether data consists solely of ASCII bytes and byte pairs
// that fall inside the GBK double-byte range: lead byte 0x81-0xFE followed by
// a trail byte 0x40-0xFE other than 0x7F.
//
// This is a range check only. It does not verify that each pair maps to an
// assigned character, and UTF-8 text can satisfy it too, so callers are
// expected to rule out UTF-8 first. Empty input is reported as GBK.
func isGBK(data []byte) bool {
	for i := 0; i < len(data); {
		if data[i] <= 0x7f {
			// Single-byte, ASCII-compatible code.
			i++
			continue
		}
		// A lead byte with nothing after it is a truncated character.
		if i+1 >= len(data) {
			return false
		}
		lead, trail := data[i], data[i+1]
		if lead < 0x81 || lead > 0xfe {
			return false
		}
		// 0x7F is the one value inside 0x40-0xFE that GBK never uses as a
		// trail byte.
		if trail < 0x40 || trail > 0xfe || trail == 0x7f {
			return false
		}
		i += 2
	}
	return true
}

// isUtf8 reports whether data is entirely well-formed UTF-8.
//
// Validation is delegated to the standard library, so truncated sequences,
// stray continuation bytes, overlong forms and the obsolete 5- and 6-byte
// forms are all rejected, while 2-, 3- and 4-byte sequences are accepted.
// Empty input is reported as UTF-8.
func isUtf8(data []byte) bool {
	return utf8.Valid(data)
}

// preNUm counts the consecutive 1 bits at the most-significant end of data,
// stopping at the first 0 bit. The result ranges from 0 (top bit clear) to 8
// (data is 0xFF).
func preNUm(data byte) int {
	count := 0
	// Walk a single-bit probe from bit 7 downwards; it becomes 0 once it has
	// been shifted past bit 0.
	for probe := byte(0x80); probe != 0 && data&probe != 0; probe >>= 1 {
		count++
	}
	return count
}

// GetStrCoding classifies data as UTF8, GBK or UNKNOWN.
//
// The order of the checks matters: isGBK only tests whether byte pairs fall
// inside the GBK code range, and the bytes of UTF-8 text fall inside that
// range as well, so the isGBK answer is meaningful only after isUtf8 has
// ruled UTF-8 out.
func GetStrCoding(data []byte) string {
	switch {
	case isUtf8(data):
		return UTF8
	case isGBK(data):
		return GBK
	default:
		return UNKNOWN
	}
}
